{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import sys\n",
    "sys.path.append(\"../utilities/\")\n",
    "from utility import FeatureGenerator"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Structure</th>\n",
       "      <th>Class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12dichlorotetrafluoroethane (cryofluorane)</td>\n",
       "      <td>[F]C([F])([Cl])C([F])([F])[Cl]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2methylthiophencyclidine (gacyclidine)</td>\n",
       "      <td>C[C@@]1([H])CCCC[C@@]1(N2CCCCC2)C=3[S]C=CC=3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7amphetaminoethyltheophylline (fenetylline)</td>\n",
       "      <td>C[C@@]([H])(C/C1=C/C=CC=C1)NCCN2C=NC=3N(C)C(=O...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>acaprazine</td>\n",
       "      <td>CC(=O)NCCCN1CCN(CC1)C2=CC([Cl])=C/C=C2/[Cl]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>acebrochol</td>\n",
       "      <td>CC(C)([H])CCC[C@@](C)([H])[C@@]3([H])CC[C@@]4(...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          Name  \\\n",
       "0   12dichlorotetrafluoroethane (cryofluorane)   \n",
       "1       2methylthiophencyclidine (gacyclidine)   \n",
       "2  7amphetaminoethyltheophylline (fenetylline)   \n",
       "3                                   acaprazine   \n",
       "4                                   acebrochol   \n",
       "\n",
       "                                           Structure  Class  \n",
       "0                     [F]C([F])([Cl])C([F])([F])[Cl]      1  \n",
       "1       C[C@@]1([H])CCCC[C@@]1(N2CCCCC2)C=3[S]C=CC=3      1  \n",
       "2  C[C@@]([H])(C/C1=C/C=CC=C1)NCCN2C=NC=3N(C)C(=O...      1  \n",
       "3        CC(=O)NCCCN1CCN(CC1)C2=CC([Cl])=C/C=C2/[Cl]      1  \n",
       "4  CC(C)([H])CCC[C@@](C)([H])[C@@]3([H])CC[C@@]4(...      1  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Training split: the preview below shows the Name, Structure and Class columns.\n",
    "train_df = pd.read_csv(\n",
    "    \"../datasets/blood_brain_barrier_solubility/BBB_TrainingSet1.csv\"\n",
    ")\n",
    "train_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Structure</th>\n",
       "      <th>Class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>acepromazine</td>\n",
       "      <td>CN(C)CCCN2C1=CC=CC=C1[S]C3=CC=C(C=C23)C(C)=O</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>aceprometazine</td>\n",
       "      <td>C[C@@]([H])(CN2C1=CC=CC=C1[S]C3=CC=C(C=C23)C(C...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>acetophenazine</td>\n",
       "      <td>CC(=O)C4=CC=C3[S]C1=CC=CC=C1N(CCCN2CCN(CCO)CC2...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>adinazolam</td>\n",
       "      <td>CN(C)C/C3=N/N=C4C/N=C(/C1=C/C=CC=C1)C2=CC([Cl]...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>albutoin</td>\n",
       "      <td>CC(C)([H])C[C@@]1([H])NC(=[S])N(CC=C)C1=O</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Name                                          Structure  Class\n",
       "0    acepromazine       CN(C)CCCN2C1=CC=CC=C1[S]C3=CC=C(C=C23)C(C)=O      1\n",
       "1  aceprometazine  C[C@@]([H])(CN2C1=CC=CC=C1[S]C3=CC=C(C=C23)C(C...      1\n",
       "2  acetophenazine  CC(=O)C4=CC=C3[S]C1=CC=CC=C1N(CCCN2CCN(CCO)CC2...      1\n",
       "3      adinazolam  CN(C)C/C3=N/N=C4C/N=C(/C1=C/C=CC=C1)C2=CC([Cl]...      1\n",
       "4        albutoin          CC(C)([H])C[C@@]1([H])NC(=[S])N(CC=C)C1=O      1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Held-out test split, read with the same column layout as the training file.\n",
    "test_df = pd.read_csv(\n",
    "    \"../datasets/blood_brain_barrier_solubility/BBB_TestSet1.csv\"\n",
    ")\n",
    "test_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Structure</th>\n",
       "      <th>Class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>NC(N)=NC=1[S]/C=C(/C)N=1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14</td>\n",
       "      <td>O=N(=O)C=1C=CNC=1NCC[S]CC=2C=CC=CN=2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15</td>\n",
       "      <td>CC=1C(C)=CC(=NC=1C)C(C)(C)[S]C(C)(C)CNC=2NC=CC...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>16</td>\n",
       "      <td>NC(N)=N/C1=N/C(=C[S]1)C2=CC=CC=C2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>24</td>\n",
       "      <td>O=C(C)NCCCOC=2C=C/C=C(/CN1CCCCC1)C=2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Name                                          Structure  Class\n",
       "0    2                           NC(N)=NC=1[S]/C=C(/C)N=1      1\n",
       "1   14               O=N(=O)C=1C=CNC=1NCC[S]CC=2C=CC=CN=2      1\n",
       "2   15  CC=1C(C)=CC(=NC=1C)C(C)(C)[S]C(C)(C)CNC=2NC=CC...      1\n",
       "3   16                  NC(N)=N/C1=N/C(=C[S]1)C2=CC=CC=C2      1\n",
       "4   24               O=C(C)NCCCOC=2C=C/C=C(/CN1CCCCC1)C=2      1"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# External set; in the preview its Name column holds numeric identifiers, not drug names.\n",
    "external_df = pd.read_csv(\n",
    "    \"../datasets/blood_brain_barrier_solubility/BBB_ExternalSet.csv\"\n",
    ")\n",
    "external_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1093, 500, 246)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row counts of the (train, test, external) splits, in that order.\n",
    "tuple(map(len, (train_df, test_df, external_df)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_features(smiles):\n",
    "    \"\"\"Return the TPATF features for one structure string, or None on failure.\n",
    "\n",
    "    Builds a ``FeatureGenerator`` from ``smiles`` and returns whatever its\n",
    "    ``toTPATF()`` method produces. Any ordinary error raised along the way is\n",
    "    swallowed and reported as ``None`` so that one bad row does not abort a\n",
    "    whole ``Series.apply`` pass; callers should check for ``None`` entries.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return FeatureGenerator(smiles).toTPATF()\n",
    "    except Exception:\n",
    "        # Exception (not a bare except) so KeyboardInterrupt and SystemExit still\n",
    "        # propagate and a long featurisation run can be interrupted.\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Featurise each training structure; get_features yields None for rows that fail.\n",
    "train_df['tpatf'] = train_df['Structure'].apply(get_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Featurise each test structure; get_features yields None for rows that fail.\n",
    "test_df['tpatf'] = test_df['Structure'].apply(get_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Featurise each external-set structure; get_features yields None for rows that fail.\n",
    "external_df['tpatf'] = external_df['Structure'].apply(get_features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "import numpy as np\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1093, 2692) (1093,)\n",
      "(500, 2692) (500,)\n"
     ]
    }
   ],
   "source": [
    "# Stack the per-row feature vectors into float32 matrices; labels come from Class.\n",
    "train_x = np.array(list(train_df.tpatf.values), dtype=np.float32)\n",
    "train_y = train_df.Class.values\n",
    "test_x = np.array(list(test_df.tpatf.values), dtype=np.float32)\n",
    "test_y = test_df.Class.values\n",
    "print(train_x.shape, train_y.shape)\n",
    "print(test_x.shape, test_y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 10 candidates, totalling 50 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:    6.0s remaining:    1.3s\n",
      "[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    6.6s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise',\n",
       "       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',\n",
       "            criterion='gini', max_depth=None, max_features='auto',\n",
       "            max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "            min_impurity_split=None, min_samples_leaf=1,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,\n",
       "            verbose=0, warm_start=False),\n",
       "       fit_params=None, iid=True, n_jobs=-1,\n",
       "       param_grid={'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
       "       scoring=None, verbose=True)"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fix the forest's seed so the selected model and the reports below are repeatable\n",
    "# across kernel restarts.\n",
    "RANDOM_STATE = 42\n",
    "clf = RandomForestClassifier(class_weight=\"balanced\", random_state=RANDOM_STATE)\n",
    "# Only the number of trees is tuned: 100 to 1000 in steps of 100.\n",
    "param_grid = {\"n_estimators\": list(range(100, 1001, 100))}\n",
    "grid_clf = GridSearchCV(estimator=clf, cv=5, param_grid=param_grid, verbose=True, n_jobs=-1)\n",
    "grid_clf.fit(train_x, train_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, class_weight='balanced',\n",
       "            criterion='gini', max_depth=None, max_features='auto',\n",
       "            max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "            min_impurity_split=None, min_samples_leaf=1,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            n_estimators=700, n_jobs=1, oob_score=False, random_state=None,\n",
       "            verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep the estimator that the grid search selected; it is displayed below.\n",
    "model = grid_clf.best_estimator_\n",
    "model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Training performance (scored on the same data the model was fit on, so these figures are optimistic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "         -1       1.00      1.00      1.00       261\n",
      "          1       1.00      1.00      1.00       832\n",
      "\n",
      "avg / total       1.00      1.00      1.00      1093\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Score the tuned forest on the very data it was fit on (an optimistic sanity check).\n",
    "y_pred = model.predict(train_x)\n",
    "print(classification_report(train_y, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Test performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "         -1       0.95      0.78      0.85        49\n",
      "          1       0.98      1.00      0.99       451\n",
      "\n",
      "avg / total       0.97      0.97      0.97       500\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Score the tuned forest on the held-out test split.\n",
    "y_pred = model.predict(test_x)\n",
    "print(classification_report(test_y, y_pred))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
